引言

这里直接读取作者给定的第一个病人的Gene expression analysis: discovery patient tumor,用的是 10x genomics 3’ Chromium expression assay.

Following sequence alignment and filtering, a total of 7431 tumor cells

主要是比较免疫治疗前后的肿瘤细胞的免疫变化,免疫治疗前有2243 cells ,免疫治疗后是3个时间点,细胞数量多一点,是5188 cells

载入必要的R包

需要自行下载安装一些必要的R包! 而且需要注意版本 Seurat

因为大量学员在中国大陆,通常不建议大家使用下面的R包安装方法,建议是切换镜像后再下载R包。

参考:http://www.bio-info-trainee.com/3727.html

# 下面代码不运行。
# Enter commands in R (or R studio, if installed)
# Install the devtools package from Hadley Wickham
install.packages('devtools')
# Replace '2.3.0' with your desired version
devtools::install_version(package = 'Seurat', version = package_version('2.3.0'))

library(Seurat)

加载R包

rm(list = ls()) # clear the environment
#load all the necessary libraries
options(warn=-1) # turn off warning message globally
suppressMessages(library(Seurat))

读入文章关于第一个病人的tumor表达矩阵

start_time <- Sys.time()
# 如果觉得这里较慢,可以使用 data.table 包的 fread函数。
raw_dataTumor <- read.csv('../Output_2018-03-12/GSE117988_raw.expMatrix_Tumor.csv.gz', header = TRUE, row.names = 1)
end_time <- Sys.time()
end_time - start_time
## Time difference of 44.90509 secs
# 通常电脑一分钟可以搞定。


dim(raw_dataTumor) # 7,431 cells and 21,861 genes - already filtered
## [1] 21861  7431
dataTumor <- log2(1 + sweep(raw_dataTumor, 2, median(colSums(raw_dataTumor))/colSums(raw_dataTumor), '*')) # Normalization
cellTypes <- sapply(colnames(dataTumor), function(x) ExtractField(x, 2, '[.]'))
cellTypes <-ifelse(cellTypes == '1', 'Tumor_Before', 'Tumor_AcquiredResistance')
table(cellTypes) 
## cellTypes
## Tumor_AcquiredResistance             Tumor_Before 
##                     5188                     2243

表达矩阵的质量控制

简单看看表达矩阵的性质,主要是基因数量,细胞数量;以及每个细胞表达基因的数量,和每个基因在多少个细胞里面表达。

# 可以看到,2万多的基因里面,
# 绝大部分基因只在七千多细胞的500个不到的表达,比外周血数据好一点。
fivenum(apply(dataTumor,1,function(x) sum(x>0) ))
##    VP2 GPRIN2   EML3 ZNF140  RPLP1 
##      1      8    103    566   7431
boxplot(apply(dataTumor,1,function(x) sum(x>0) ))

# 可以看到,七千多细胞里面
# 绝大部分细胞只能检测不到2000个基因,已经算是不错的了
fivenum(apply(dataTumor,2,function(x) sum(x>0) ))
## GGAACTTAGGAATCGC.1 TACGGTACAAGCCGCT.2 CCTACCAAGCGTGAGT.1 
##              192.0             1059.0             1380.0 
## TGAGCCGAGACTAGAT.2 GATCGTAGTCATATGC.2 
##             1971.5             5888.0
hist(apply(dataTumor,2,function(x) sum(x>0) ))

然后创建Seurat的对象

# Create Seurat object
tumor <- CreateSeuratObject(raw.data = dataTumor, min.cells = 1, min.genes = 0, project = '10x_Tumor') # already normalized
tumor # 21,861 genes and 7,431 cells
## An object of class seurat in project 10x_Tumor 
##  21861 genes across 7431 samples.
# 可以看到上面创建Seurat对象的那些参数并没有过滤基因或者细胞。


# Add meta.data (nUMI and cellTypes)
tumor <- AddMetaData(object = tumor, metadata = apply(raw_dataTumor, 2, sum), col.name = 'nUMI_raw')
tumor <- AddMetaData(object = tumor, metadata = cellTypes, col.name = 'cellTypes')

一些质控

这里绘图,可以指定分组,前提是这个分组变量存在于meta信息里面,我们创建对象后使用函数添加了 cellTypes 属性,所以可以用来进行可视化。

这里是:‘cellTypes’,就是免疫治疗前后。

sce=tumor
VlnPlot(object = sce, 
        features.plot = c("nGene", "nUMI"), 
        group.by = 'cellTypes', nCol = 2)

GenePlot(object = sce, gene1 = "nUMI", gene2 = "nGene")

可以看看高表达量基因是哪些

tail(sort(Matrix::rowSums(sce@raw.data)))
##     RPS2   MT-CO1   MT-CO3   MT-CO2    RPS18   MALAT1 
## 34406.98 35104.22 35400.39 36274.50 36712.57 47996.87
## 散点图可视化任意两个基因的一些属性(通常是细胞的度量)
# 这里选取两个基因。
tmp=names(sort(Matrix::rowSums(sce@raw.data),decreasing = T))
GenePlot(object = sce, gene1 = tmp[1], gene2 = tmp[2])

# 散点图可视化任意两个细胞的一些属性(通常是基因的度量)
# 这里选取两个细胞
CellPlot(sce,sce@cell.names[3],sce@cell.names[4],do.ident = FALSE)

最后标准聚类可视化

很简单的流程,先ScaleData,再FindVariableGenes,然后根据找到的高变异基因进行RunPCA,再根据PCA结果进行FindClusters即可,最后再RunTSNE后进行可视化。

start_time <- Sys.time()
# 最耗费时间的步骤在这里。
tumor <- ScaleData(object = tumor, vars.to.regress = c('nUMI_raw'), model.use = 'linear', use.umi = FALSE)
## NormalizeData has not been run, therefore ScaleData is running on non-normalized values. Recommended workflow is to run NormalizeData first.
## ScaleData is running on non-normalized values. Recommended workflow is to run NormalizeData first.
## 
## Time Elapsed:  36.7578909397125 secs
tumor <- FindVariableGenes(object = tumor, 
                           mean.function = ExpMean, 
                           dispersion.function = LogVMR, 
                           x.low.cutoff = 0.0125, 
                           x.high.cutoff = 3, 
                           y.cutoff = 0.5)

head(tumor@var.genes)
## [1] "ISG15"    "TNFRSF18" "TNFRSF4"  "MMP23B"   "TNFRSF14" "ENO1"
length(tumor@var.genes)
## [1] 873
tumor <- RunPCA(object = tumor, 
                pc.genes = tumor@var.genes)
## [1] "PC1"
##  [1] "HLA-DRA"  "CD74"     "S100A11"  "TYROBP"   "HLA-DPA1" "HLA-DPB1"
##  [7] "HLA-DRB1" "VIM"      "C1QB"     "C1QA"     "LGALS1"   "SAT1"    
## [13] "SRGN"     "CTSB"     "FCER1G"   "SOD2"     "NPC2"     "CD68"    
## [19] "HLA-DQB1" "HLA-DQA1" "APOE"     "APOC1"    "AIF1"     "LAPTM5"  
## [25] "CTSS"     "HLA-DRB5" "HLA-B"    "C1QC"     "PSAP"     "TYMP"    
## [1] ""
##  [1] "ZFAS1"   "SOX4"    "TUBA1A"  "HES6"    "TUBB"    "HMGB2"   "UBE2S"  
##  [8] "KRT20"   "CHGA"    "HMGB1"   "CCK"     "VIP"     "HSPD1"   "CACYBP" 
## [15] "HSPA8"   "FAM46A"  "GADD45G" "MRPL18"  "KRT10"   "DNAJA1"  "CHORDC1"
## [22] "DNAJB6"  "TAC3"    "FKBP4"   "SNHG15"  "KPNA2"   "CENPF"   "HSPE1"  
## [29] "TOP2A"   "NUSAP1" 
## [1] ""
## [1] ""
## [1] "PC2"
##  [1] "TUBA1A"  "HSPA8"   "HMGB2"   "BASP1"   "HSPD1"   "DNAJB6"  "KPNA2"  
##  [8] "HSPH1"   "UBE2S"   "UBE2C"   "CACYBP"  "CENPF"   "DNAJA1"  "HSPA1A" 
## [15] "UBC"     "ARL6IP1" "CHORDC1" "HSPB1"   "ASPM"    "BIRC5"   "ZFAND2A"
## [22] "FKBP4"   "C1QB"    "NUF2"    "TOP2A"   "C1QA"    "CCNB1"   "ZFAS1"  
## [29] "TYROBP"  "NDC80"  
## [1] ""
##  [1] "IGFBP7"  "S100A16" "IFITM3"  "SELM"    "MGP"     "SPARC"   "CD44"   
##  [8] "CALD1"   "S100A1"  "IFI27"   "NNMT"    "SPARCL1" "TM4SF1"  "CAV1"   
## [15] "LAMA4"   "RBP7"    "COL6A2"  "PTRF"    "EMP1"    "LGALS7B" "CD59"   
## [22] "IFITM2"  "COL1A1"  "NEAT1"   "THY1"    "TIMP1"   "PRSS23"  "COL4A2" 
## [29] "S100A13" "IGFBP4" 
## [1] ""
## [1] ""
## [1] "PC3"
##  [1] "HSPA1A"   "HSPB1"    "HSPH1"    "DNAJB1"   "IGFBP7"   "SERPINH1"
##  [7] "DNAJA1"   "HSPA8"    "HSPA6"    "HSPA1B"   "HSPD1"    "IFITM3"  
## [13] "UBC"      "ZFAND2A"  "DNAJB6"   "ATF3"     "TUBA1A"   "CHORDC1" 
## [19] "SPARC"    "CACYBP"   "CALD1"    "JUN"      "BAG3"     "NNMT"    
## [25] "FKBP4"    "MRPL18"   "HSPE1"    "KPNA2"    "SPARCL1"  "DNAJA4"  
## [1] ""
##  [1] "S100A1"    "CD44"      "SELM"      "CYBA"      "LGALS7B"  
##  [6] "RBP7"      "S100A16"   "MT-ND3"    "NEAT1"     "PSTPIP1"  
## [11] "LGALS7"    "HIST1H1C"  "SH3BGRL3"  "LINC00632" "ADAMTS7"  
## [16] "KLK11"     "CRABP1"    "S100A14"   "HES6"      "PHLDA2"   
## [21] "TYROBP"    "PYDC1"     "AQP3"      "C1QB"      "C1QA"     
## [26] "H1F0"      "MT-ND6"    "CHST8"     "CD68"      "SNHG19"   
## [1] ""
## [1] ""
## [1] "PC4"
##  [1] "HSPA1A"   "DNAJB1"   "ZFAND2A"  "HSPA1B"   "HSPB1"    "HSPA6"   
##  [7] "SERPINH1" "ATF3"     "HSPH1"    "JUN"      "SNHG15"   "CHORDC1" 
## [13] "DNAJA4"   "MRPL18"   "NR4A1"    "CLK1"     "IER2"     "CACYBP"  
## [19] "RSRC2"    "DNAJB6"   "HSPD1"    "FKBP4"    "LANCL2"   "BAG3"    
## [25] "HSPA4L"   "ARC"      "DNAJA1"   "MXD1"     "DUSP1"    "EGR4"    
## [1] ""
##  [1] "UBE2C"   "CENPF"   "BIRC5"   "TOP2A"   "CENPA"   "HMGB2"   "CCNB2"  
##  [8] "ASPM"    "CCNB1"   "UBE2S"   "NUF2"    "CDKN3"   "AURKA"   "CDC20"  
## [15] "PIF1"    "NUSAP1"  "AURKB"   "KPNA2"   "HMGB1"   "CDK1"    "KNSTRN" 
## [22] "NDC80"   "ARL6IP1" "TUBB"    "MXD3"    "TMSB4X"  "TUBA1C"  "SPC25"  
## [29] "TMSB10"  "TGIF1"  
## [1] ""
## [1] ""
## [1] "PC5"
##  [1] "TOP2A"   "CD44"    "UBE2C"   "CENPF"   "CCNB1"   "BIRC5"   "JUN"    
##  [8] "CCNB2"   "ASPM"    "S100A1"  "PIF1"    "SELM"    "S100A16" "NUSAP1" 
## [15] "AURKA"   "CDC20"   "NUF2"    "RBP7"    "DNAJB1"  "CDKN3"   "HSPA1B" 
## [22] "CENPA"   "AURKB"   "IER2"    "ATF3"    "COTL1"   "DDIT4"   "KPNA2"  
## [29] "HSPA1A"  "HSPA6"  
## [1] ""
##  [1] "VIP"        "CHGA"       "CD63"       "ENO1"       "CCK"       
##  [6] "ALDOA"      "BASP1"      "CBLN2"      "KRT20"      "TMEM176B"  
## [11] "AMBN"       "TMEM176A"   "TPPP3"      "AP000459.7" "CST3"      
## [16] "RGS4"       "TXN"        "MANF"       "S100A6"     "CD36"      
## [21] "NELL1"      "ANXA1"      "CNN3"       "GPNMB"      "FABP7"     
## [26] "MMP2"       "FABP5"      "TUBA1A"     "CTSL"       "LGALS3"    
## [1] ""
## [1] ""
tumor <- RunTSNE(object = tumor, 
                 dims.use = 1:10, 
                 perplexity = 25)


TSNEPlot(tumor, group.by = 'cellTypes', colors.use = c('#EF8A62', '#67A9CF'))

end_time <- Sys.time()
end_time - start_time
## Time difference of 1.322714 mins
#  这里文章里面没有运行 FindClusters ,仅仅是使用 cellTypes

输出seurat结果后面使用

start_time <- Sys.time()
save(tumor,file = 'patient1.tumor.output.Rdata')
end_time <- Sys.time()
end_time - start_time
## Time difference of 1.088687 mins
# 这个步骤会输出文件 

同样的,也是需要marker基因来把肿瘤细胞进行分类,最后文章效果图是:

需要的marker基因也是附件,如下:

显示运行环境

sessionInfo()
## R version 3.5.1 (2018-07-02)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS  10.14
## 
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] Seurat_2.3.4  Matrix_1.2-14 cowplot_0.9.3 ggplot2_3.2.0
## 
## loaded via a namespace (and not attached):
##   [1] Rtsne_0.15          colorspace_1.4-0    class_7.3-14       
##   [4] modeltools_0.2-22   ggridges_0.5.1      mclust_5.4.1       
##   [7] rprojroot_1.3-2     htmlTable_1.13.1    base64enc_0.1-3    
##  [10] rstudioapi_0.9.0    proxy_0.4-22        npsurv_0.4-0       
##  [13] flexmix_2.3-14      bit64_0.9-7         codetools_0.2-15   
##  [16] splines_3.5.1       R.methodsS3_1.7.1   lsei_1.2-0         
##  [19] robustbase_0.93-3   knitr_1.21          jsonlite_1.6       
##  [22] Formula_1.2-3       ica_1.0-2           cluster_2.0.7-1    
##  [25] kernlab_0.9-27      png_0.1-7           R.oo_1.22.0        
##  [28] compiler_3.5.1      httr_1.3.1          backports_1.1.3    
##  [31] assertthat_0.2.0    lazyeval_0.2.1      lars_1.2           
##  [34] acepack_1.4.1       htmltools_0.3.6     tools_3.5.1        
##  [37] bindrcpp_0.2.2      igraph_1.2.2        gtable_0.2.0       
##  [40] glue_1.3.0          RANN_2.6            reshape2_1.4.3     
##  [43] dplyr_0.7.8         Rcpp_1.0.0          gdata_2.18.0       
##  [46] ape_5.2             nlme_3.1-137        iterators_1.0.10   
##  [49] fpc_2.2-3           gbRd_0.4-11         lmtest_0.9-36      
##  [52] xfun_0.4            stringr_1.3.1       irlba_2.3.2        
##  [55] gtools_3.8.1        DEoptimR_1.0-8      MASS_7.3-50        
##  [58] zoo_1.8-4           scales_1.0.0        doSNOW_1.0.16      
##  [61] parallel_3.5.1      RColorBrewer_1.1-2  yaml_2.2.0         
##  [64] reticulate_1.10     pbapply_1.3-4       gridExtra_2.3      
##  [67] rpart_4.1-13        segmented_0.5-3.0   latticeExtra_0.6-28
##  [70] stringi_1.2.4       foreach_1.4.4       checkmate_1.9.1    
##  [73] caTools_1.17.1.1    bibtex_0.4.2        Rdpack_0.10-1      
##  [76] SDMTools_1.1-221    rlang_0.3.1         pkgconfig_2.0.2    
##  [79] dtw_1.20-1          prabclus_2.2-6      bitops_1.0-6       
##  [82] evaluate_0.12       lattice_0.20-35     ROCR_1.0-7         
##  [85] purrr_0.3.0         bindr_0.1.1         labeling_0.3       
##  [88] htmlwidgets_1.3     bit_1.1-14          tidyselect_0.2.5   
##  [91] plyr_1.8.4          magrittr_1.5        R6_2.3.0           
##  [94] snow_0.4-3          gplots_3.0.1.1      Hmisc_4.2-0        
##  [97] pillar_1.3.1        foreign_0.8-70      withr_2.1.2        
## [100] fitdistrplus_1.0-11 mixtools_1.1.0      survival_2.42-3    
## [103] nnet_7.3-12         tsne_0.1-3          tibble_2.0.1       
## [106] crayon_1.3.4        hdf5r_1.0.1         KernSmooth_2.23-15 
## [109] rmarkdown_1.10      grid_3.5.1          data.table_1.12.0  
## [112] metap_1.0           digest_0.6.18       diptest_0.75-7     
## [115] tidyr_0.8.1         R.utils_2.7.0       stats4_3.5.1       
## [118] munsell_0.5.0